;TODO: if I don't have T in cx, I can do dithering: fadd bx, dword-store to cx:ax, read ax right away
;TODO: I don't need 196 (C4) nor -512.3. They're used only in 'N=pd-(ro|p[i])=pd-ro.z|p[i].z=196+512*p[i].z' which is better

; Vector3: X grows right, Y down, Z forward.
; On the FP stack and in memory it looks like {Y X Z} (sometimes I need only Y).

; Image origin and overlapping code/data constants.
; The two words below are executed as the first two instructions of the program
; AND read later as data (loop count 19, plane distance 196, divisor table,
; the dword 0x00C40013 used as a "huge" initial value in GEM).
org 100h ; assume al=0 bx=0 sp=di=-2 si=0100h bp=09??h
C19  dw 19                 ;=19 00  adc ax,[bx+si]
C196 dw 196    ; ax=13h    ;=C4 00  les ax,[bx+si]
RO_Z equ $-1-4 ; ray_origin.z = about -512.3 (don't care about LSbyte)
               ; $ is 104h here, so RO_Z = 0FFh: a dword read there is the byte
               ; just below the image followed by 13 00 C4 (top bytes C4 00 13).
DI_ equ -2     ; pixel_adr@di = -2 (-3 would be correct but indistinguishable)

; S(x): address the absolute location x through si with a byte-sized
; displacement. Only valid while si = 100h (x + si - 100h == x).
%define S(x) [byte x + si-100h]

; P: reached after the two data words at 100h have run as instructions
; (the note on C196 says they leave ax=13h).
P:int 10h      ; video mode, default palette

;Each frame: the visible pixels are A0000..AF9FF, I want X=0 Y=0 in the center
; M is re-entered once per frame by the 'jnz M' at the end of the pixel loop.
; dx serves two purposes: it is copied to es (segment used by stosb) and it is
; the high word of the dx:bx pixel coordinate counter.
M:mov dx,0xA000-10-20-20-4 ;=0x9fca
  mov es,dx    ; dx:bx=YX:XX = 0x9fca:0

;Generate gem normals to p0..p19=[bp+200h,300h,...].
; Runs once per frame. cx is loaded from [si] (=19 with si=100h), so the G loop
; runs 19 times; bp advances by si (100h) per iteration and each vector is
; stored as three floats at [bp+si], [bp+si+4], [bp+si+8].
; The outer pusha/popa pair restores every register (including bp and T in cx)
; before the pixel loop starts; the pushed copies are read back via [di+const].
  pusha  ; adr:   -18 -16 -14 -12 -10  -8  -6  -4  -2
         ; stack:  di  si  bp  sp  bx  dx  cx  ax   0
C3211:   ; data:   -2 100 9??  -2  0  9fca T  key
         ; C3211 labels the encoding of the next instruction; the pixel loop
         ; reads that word as an integer scale factor (named 3211 there).
  mov cx,[si]
G:add bp,si    ; i@cx = 19...1; bp points to p[19-i]; carry=0
  pusha        ; inner save: popa below restores i (cx) and bx=0 for the next pass
  fninit       ; clear FP stack
  fldln2
N:fchs         ; negate once per remaining count in cx; leaves cx=0
  loop N             ;|z=0.693*(-1^i)
  fild word[-6-16+di-DI_]  ; pushed i (cx slot of the inner pusha, address -22)
  fsincos            ;|y=cos(i) x=sin(i) z   ; len=1.2167

;Do a bunch of rotations. (It doesn't need to be fast.)
; Each of the 16 steps runs the S body twice: cmc flips carry 0->1 (jc taken,
; second pass), then 1->0 (fall through). The FPU instructions, inc and loop
; do not modify carry, so it is 0 again at the next step.
; Stack comments: left column = first pass, right column = second pass.
  mov cl,16    ; j@bx=0..15   (ch is 0 because 'loop N' ended with cx=0)
S:fld st2            ;|z y x z                     ;|x sz y x cz
  fild word[-6+di-DI_]     ; T = cx slot of the outer pusha (address -6)
  fidiv word[bx+si]  ;|t=T/[19,-15360,196,...][j]  (unaligned words starting at 100h+j)
  fsincos            ;|c=cos(t) s=sin(t) z y x z   ;|c s x sz y x cz
  fmulp st5          ;|s z y x cz                  ;|s x sz y cx cz
  fmulp              ;|sz y x cz                   ;|sx sz y cx cz
  cmc
; BIG names the dword formed by the encodings of 'fmulp', 'cmc' and the first
; byte of 'jc S'; the pixel loop uses it as the divisor for screen coordinates.
BIG equ $-3 ;=1928710622
  jc S
  fsubp st4          ;|sz y cx cz-sx
  faddp st2          ;|y cx+sz cz-sx
  fstp st3           ;|new.x=cx+sz .y=cz-sx .z=y
  inc bx             ; next divisor offset; inc leaves carry untouched
  loop S       ; bx=16
  fstp dword[bp+si]     ; store the three components, top of FP stack first
  fstp dword[bp+si+4]
  fstp dword[bp+si+8]
  popa               ; restore i, bx=0 and the advanced bp
  loop G
  popa               ; restore the registers saved before C3211

;Each pixel: cx=T dx:bx=YX:XX(init=9fca:0) di=adr(init=0)
; Per-pixel loop. dx:bx is a 32-bit counter advanced by 0x0000CCCD per pixel
; (add bx / inc dx on carry). All registers are saved with pusha so GEM may
; clobber them; the colour comes back through the pushed-ax stack slot.
X:inc dx       ; part of "dx:bx += 0x0000CCCD"
X2:
  fninit       ; adr:     -18 -16 -14 -12 -10  -8  -6  -4  -2
  pusha        ; stack:    di  si  bp  sp  bx  dx  cx  ax   0
  mov di,-4    ; s16:  pixadr 100 9??  -2  ..X..Y  T result
               ; di=-4 is the base for the [di+4-n] stack-slot addressing used
               ; below and inside GEM/BGD, and the -4 step of their si loops.

;Compute ray direction.
  fld1
  fild dword[di+4-11]   ; X: dword starting one byte below the pushed bx (address -11)
  fild dword S(BIG)     ; constant built from instruction bytes (see BIG)
  fdiv st1,st0          ;|BIG X/BIG 1
  fidivr dword[di+4-10] ;|y=Y/BIG x=X/BIG z=1   (Y = pushed dx:bx at address -10)
  call GEM              ;|color
  fimul word S(C3211)
  fistp word[di+4-4] ; color*3211 -> pushed ax
  popa               ; ax = scaled colour, all other registers restored

; 4-bit builtin gray palette with cheap dithering.
; The carry out of al+bl is folded into the high byte: al = 16 + ah + carry.
  add al,bl
  mov al,16
  adc al,ah
;  adc al,0xF0 ; correct overflow
;  jnc O
;  salc
;O:add al,0x20

;; Faster version: draw each pixel twice.
;  stosb
;  add bx,0xCCCD ;dx:bx = YXX += 0000CCCD
;  adc dx,0

  stosb        ; write the pixel to es:di, di++
  add bx,0xCCCD ;dx:bx = YXX += 0000CCCD
  jnc X2       ; no carry: dx unchanged
  jnz X        ;do 65536 iterations (carry: inc dx at X; frame ends when bx wraps to 0)

  inc cx       ; T++
  in al,60h    ; read the keyboard data port
  dec al
  jnz M        ; fallthrough
               ; Loops to the next frame unless the byte read was 1.
               ; NOTE(review): on exit execution falls into GEM below and leaves
               ; through one of its 'ret's; presumably that pops the 0 word shown
               ; at the stack top (standard .COM exit) - confirm.

; GEM: intersect the ray with the gem, i.e. clip it against 19 planes.
; In:  FP stack |rd.y rd.x rd.z ; si=100h, di=-4, bp = base used for the normals.
; Out: FP stack |color rd.y rd.x rd.z
; Clobbers ax, bx, cx, bp, gs (the pixel loop restores the GPRs with popa).
; tf = largest t over planes with D<0, tb = smallest t over planes with D>=0;
; the gem is hit when tf<tb after all planes.
GEM: ;Hit the gem. Front plane @ gs, back plane @ bp
  fild dword[si]     ;|tf=0 tb=HUGE=0xC40013 y x z
  fldz
  mov cx,[si]  ; i@cx = 19...1; bx+si points to p[i]
  lea bx,[bp+si]

;Ray-plane intersection.
I:                   ;|tf tb rd.y .x .z
;Dot product.
; si steps 108h,104h,100h. jpo tests the parity flag left by 'add si,di'
; (FPU instructions do not touch the CPU flags): the low byte has odd parity
; for 08h and 04h, even for 00h, so the loop body runs three times and si is
; back at 100h afterwards.
  add si,12    ;108 104 100
DP:add si,di   ;-4
  fld dword[bx+si]   ;|p[i].z ...
  fmul st5           ;|rd.z*p[i].z ...
  jpo DP             ;|(rd*p[i]).y .x .z tf tb rd.y .x .z
  faddp
  faddp              ;|D=(rd|p[i]) tf tb rd.y .x .z

  fst dword[bx+di]   ; -> p[i].dot_rd (the dword just below the address in bx; HIT reads it back)
  ftst
  fnstsw ax
  sahf         ; cf=1 if we're in front of the plane (D<0); kept until 'jnc BACK'
  fld dword S(RO_Z)
  fmul dword[bx+si+8];|(ro|p[i]) D tf tb rd.y .x .z       ; ro.z=-512.3
  fisubr word S(C196);|N=pd-(ro|p[i]) D tf tb rd.y .x .z  ; pd=196
  fdivrp st1         ;|t=N/D tf tb rd.y .x .z
  jnc BACK
FRONT:
  fcom st1     ; t ? tf
  fnstsw ax
  sahf
  jb NEXT      ;if t>=tf { tf=t; pf@gs = current; }
  fst st1
  mov gs,bx
  jmp NEXT
BACK:
  fcom st2     ; t ? tb
  fnstsw ax
  sahf
  jnb NEXT     ;if t<tb { tb=t; pb@bp = current; }
  fst st2
  mov bp,bx
NEXT:
  fstp st0           ;|tf tb rd.y .x .z
  fcom               ; tf ? tb
  fnstsw ax
  sahf         ;if tf>=tb { no_hit: cf=0; early exit } else { cf=1 }
  jnb EXIT
  lea bx,[bx+si]; don't overwrite carry   (bx += 100h: next plane)
  loop I
EXIT:
  fcompp             ;|rd.y .x .z (get rid of 'tf tb')
  jc HIT             ; carry still comes from the last sahf above
; Miss: shade the background along the primary ray.
; Keep this as call+ret, not 'jmp BGD': BGD addresses its own pusha frame with
; fixed offsets, which requires the same stack depth as the 'call BGD' in HIT.
  call BGD
  ret
               ; HIT is entered with carry=1

;Compute reflections from the front and back plane.
; In:  FP stack |rd.y rd.x rd.z ; bp = back plane, gs = front plane (both set
;      by GEM), si=100h, di=-4.
; Out: FP stack |color rd.y rd.x rd.z, color = sqrt(back^2 + front^2).
; For each plane, R = (rd|p)*p - rd is built component by component (the dot
; product was stored by GEM in the dword below the plane pointer, [bp+di]) and
; passed to BGD.
HIT:
  fldz               ;|0 rd.y .x .z      (accumulator)

  mov cx,2
; Same si stepping as the dot-product loop in GEM: 108h,104h,100h, ended by
; the parity of 'add si,di'.
RF:add si,12    ;108 104 100
VMUL:add si,di   ;-4
  fld dword[bp+di]   ;|(rd|pb) 0 rd.y .x .z
  fmul dword[bp+si]  ;|(rd|pb)*pb.z 0 rd.y .x .z
  fsub st4           ;|R.z=(rd|pb)*pb.z-rd.z 0 rd.y .x .z
  jpo VMUL           ;|R.y R.x R.z 0 rd.y .x .z
  call BGD           ;|bgd 0 rd.y .x .z   (BGD replaces R by one value)
  fmul st0           ; square it
  faddp              ;|back .y .x .z                 ;|gem_color=front+back .y .x .z
;  fadd st0
  mov bp,gs          ; second pass uses the front plane
  loop RF      ; loop twice
  fsqrt        ; gamma correction
R:
  ret

;Environment map: checkerboard below, light above.
; In:  FP stack |y x z (direction), di=-4, si=100h.
; Out: FP stack |value (y, x, z replaced by a single value).
; Clobbers ax and flags; the checker path also changes dx.
; The checker path uses fixed stack addresses: [di+4-6] is T as pushed by the
; pixel loop, and [di+4-24]/[di+4-28] are the ax/dx slots of the pusha below.
; This only works at the stack depth of the two existing call sites (pixel loop
; -> GEM -> BGD), so BGD must be reached by 'call' from GEM/HIT code.
BGD:                 ;|y x z
  ftst
  fnstsw ax
  sahf
  jnb CHECKER        ; y>=0 (Y grows down): checkerboard
  fstp st1           ; y<0: drop x and z, keep y
  fstp st1
  fabs         ; the sky is just abs(y) (= y^2 without gamma)
  ret
CHECKER:
  fdiv st2,st0       ;|y x z/y
  fdiv st1,st0       ;|y x/y z/y
  fmul st0           ;|y^2 x/y z/y
  fxch st2           ;|u=z/y v=x/y y^2
  fild word[di+4-6]  ; T
  fidiv word[si]     ; divide by word[si]; with si=100h that is C19 (=19)
  faddp              ;|u+=T/word[si] v y^2
  pusha
  fadd st0           ; 2*u
  fistp word[di+4-20-2-2] ; pushed_ax=u
  fadd st0           ; 2*v
  fistp word[di+4-24-2-2] ; pushed_dx=v
  popa               ; ax = int(2u), dx = int(2v)
  xor ax,dx
  and ax,1           ; checker bit
  push ax            ; lands in the same slot [di+4-24] the pusha used for ax
  fimul word[di+4-20-2-2] ; (u xor v)*y^2
  pop ax
  ret
